{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%run NB03-EDA-MetaData-Check.ipynb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert / Transform Dichotomous Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds01_categorical_dichotomous_column_names = [\n",
    "'CODE_GENDER', # after preliminary manual fix\n",
    "'FLAG_CONT_MOBILE',\n",
    "'FLAG_DOCUMENT_2',\n",
    "'FLAG_DOCUMENT_3',\n",
    "'FLAG_DOCUMENT_4',\n",
    "'FLAG_DOCUMENT_5',\n",
    "'FLAG_DOCUMENT_6',\n",
    "'FLAG_DOCUMENT_7',\n",
    "'FLAG_DOCUMENT_8',\n",
    "'FLAG_DOCUMENT_9',\n",
    "'FLAG_DOCUMENT_10',\n",
    "'FLAG_DOCUMENT_11',\n",
    "'FLAG_DOCUMENT_12',\n",
    "'FLAG_DOCUMENT_13',\n",
    "'FLAG_DOCUMENT_14',\n",
    "'FLAG_DOCUMENT_15',\n",
    "'FLAG_DOCUMENT_16',\n",
    "'FLAG_DOCUMENT_17',\n",
    "'FLAG_DOCUMENT_18',\n",
    "'FLAG_DOCUMENT_19',\n",
    "'FLAG_DOCUMENT_20',\n",
    "'FLAG_DOCUMENT_21',\n",
    "'FLAG_EMAIL',\n",
    "'FLAG_EMP_PHONE',\n",
    "'FLAG_MOBIL',\n",
    "'FLAG_OWN_CAR',\n",
    "'FLAG_OWN_REALTY',\n",
    "'FLAG_PHONE',\n",
    "'FLAG_WORK_PHONE',\n",
    "'LIVE_CITY_NOT_WORK_CITY',\n",
    "'LIVE_REGION_NOT_WORK_REGION',\n",
    "'REG_CITY_NOT_LIVE_CITY',\n",
    "'REG_CITY_NOT_WORK_CITY',\n",
    "'REG_REGION_NOT_LIVE_REGION',\n",
    "'REG_REGION_NOT_WORK_REGION'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds01_nonmodel_column_names = [\n",
    "'SK_ID_CURR',\n",
    "'TARGET'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the value counts (NaN included) of every column that has exactly two distinct non-null values.\n",
    "for column_name in sorted(df.columns.tolist()):\n",
    "    column_values = df[column_name]\n",
    "    if column_values.nunique() != 2:\n",
    "        continue\n",
    "    print(column_values.value_counts(dropna=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every column with exactly two distinct non-null values, then report the ones\n",
    "# that are not yet in the dichotomous list.\n",
    "tmp_column_names = [\n",
    "    column_name\n",
    "    for column_name in sorted(df.columns.tolist())\n",
    "    if df[column_name].nunique() == 2\n",
    "]\n",
    "print(sorted(set(tmp_column_names).difference(ds01_categorical_dichotomous_column_names)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report listed columns that do NOT have exactly two distinct non-null values.\n",
    "for column_name in ds01_categorical_dichotomous_column_names:\n",
    "    column_values = df[column_name]\n",
    "    if column_values.nunique() == 2:\n",
    "        continue\n",
    "    print(column_values.value_counts(dropna=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report listed columns whose observed values are not exactly the strings '0' and '1'.\n",
    "expected_levels = {'0', '1'}\n",
    "for column_name in ds01_categorical_dichotomous_column_names:\n",
    "    observed_levels = set(df[column_name].unique())\n",
    "    if observed_levels != expected_levels:\n",
    "        print(df[column_name].value_counts(dropna=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CODE_GENDER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manual CODE_GENDER overrides for individual rows, keyed by SK_ID_CURR.\n",
    "# NOTE(review): the IDs are compared as strings, so this assumes SK_ID_CURR holds str values -- confirm.\n",
    "manual_gender_by_id = {\n",
    "    '141289': 'F',\n",
    "    '319880': 'F',\n",
    "    '196708': 'F',\n",
    "    '144669': 'M',\n",
    "}\n",
    "for applicant_id, gender_code in manual_gender_by_id.items():\n",
    "    df.loc[df['SK_ID_CURR'] == applicant_id, 'CODE_GENDER'] = gender_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode CODE_GENDER as string flags: 'M' -> '1', 'F' -> '0'.\n",
    "# Any other value is left as-is by replace() and then stringified by astype(str).\n",
    "# Mapping straight to string labels (instead of to ints followed by a cast) keeps pandas\n",
    "# from inferring a numeric dtype in between, which could yield '1.0'/'0.0' labels when\n",
    "# missing values are present.\n",
    "df['CODE_GENDER'] = (\n",
    "    df['CODE_GENDER']\n",
    "    .replace({'M': '1', 'F': '0'})\n",
    "    .astype(str)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### FLAG_OWN_CAR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode FLAG_OWN_CAR as string flags: 'Y' -> '1', 'N' -> '0'.\n",
    "# Any other value is left as-is by replace() and then stringified by astype(str).\n",
    "# Mapping straight to string labels (instead of to ints followed by a cast) keeps pandas\n",
    "# from inferring a numeric dtype in between, which could yield '1.0'/'0.0' labels when\n",
    "# missing values are present.\n",
    "df['FLAG_OWN_CAR'] = (\n",
    "    df['FLAG_OWN_CAR']\n",
    "    .replace({'Y': '1', 'N': '0'})\n",
    "    .astype(str)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### FLAG_OWN_REALTY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode FLAG_OWN_REALTY as string flags: 'Y' -> '1', 'N' -> '0'.\n",
    "# Any other value is left as-is by replace() and then stringified by astype(str).\n",
    "# Mapping straight to string labels (instead of to ints followed by a cast) keeps pandas\n",
    "# from inferring a numeric dtype in between, which could yield '1.0'/'0.0' labels when\n",
    "# missing values are present.\n",
    "df['FLAG_OWN_REALTY'] = (\n",
    "    df['FLAG_OWN_REALTY']\n",
    "    .replace({'Y': '1', 'N': '0'})\n",
    "    .astype(str)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Two-Way Frequency Tables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### [scipy.stats.fisher_exact](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html)\n",
    "Notes  \n",
    "The calculated odds ratio is different from the one R uses.  \n",
    "This scipy implementation returns the (more common) “unconditional Maximum Likelihood Estimate”, while R uses the “conditional Maximum Likelihood Estimate”.  \n",
    "For tables with large numbers, the (inexact) chi-square test implemented in the function chi2_contingency can also be used.  \n",
    "\n",
    "\n",
    "Fisher's exact test is a statistical test used to determine if there are nonrandom associations between two categorical variables.  \n",
    "Weisstein, Eric W. \"Fisher's Exact Test.\" From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/FishersExactTest.html\n",
    "\n",
    "\n",
    "The p-value is for the null hypothesis of independence between the two categorical variables.  \n",
    "When the p-value is below the chosen significance level, we reject the null hypothesis of independence.  \n",
    "For the odds ratio, if the confidence interval contains one, we fail to reject the null hypothesis of independence.  \n",
    "[\"How to interpret Fisher Test?\"](https://stats.stackexchange.com/questions/220044/how-to-interpret-fisher-test)\n",
    "\n",
    "##### [scipy.stats.chi2_contingency()](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)\n",
    "Notes  \n",
    "An often quoted guideline for the validity of this calculation is that the test should be used only if the observed and expected frequencies in each cell are at least 5.  \n",
    "\n",
    "\n",
    "...\n",
    "\n",
    "\n",
    "...calculate the fisher exact test to determine statistical significance...  \n",
    "\n",
    "\n",
    "http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-19_17.html\n",
    "\n",
    "\n",
    "http://www.biostathandbook.com/fishers.html\n",
    "\n",
    "\n",
    "http://www.stat.purdue.edu/~tqin/system101/method/method_fisher_sas.htm\n",
    "\n",
    "\n",
    "https://www.statsdirect.com/help/exact_tests_on_counts/fisher_exact.htm\n",
    "\n",
    "\n",
    "https://stats.stackexchange.com/questions/220044/how-to-interpret-fisher-test\n",
    "\n",
    "\n",
    "https://codereview.stackexchange.com/questions/186657/pandas-numpy-statistical-odds-ratio-test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each dichotomous column, cross-tabulate it against TARGET and test the two for association\n",
    "# with Fisher's exact test and with the chi-square test of independence.\n",
    "for column_name in ds01_categorical_dichotomous_column_names:\n",
    "    if column_name == 'TARGET':\n",
    "        continue\n",
    "    df_crosstab = pd.crosstab(index=df['TARGET'], columns=df[column_name])\n",
    "    print(df_crosstab)\n",
    "    # fisher_exact only accepts a 2x2 table; without this guard a column with any other\n",
    "    # number of observed levels raises ValueError and aborts the whole loop.\n",
    "    if df_crosstab.shape == (2, 2):\n",
    "        odds_ratio, fisher_p_value = stats.fisher_exact(df_crosstab)\n",
    "        print('Fisher exact odds ratio:', odds_ratio)\n",
    "        print('Fisher exact p-value:', fisher_p_value)\n",
    "    else:\n",
    "        print('Fisher exact test skipped, table shape is', df_crosstab.shape)\n",
    "    # p_value keeps holding the chi-square p-value, as it did before the Fisher result got its own name.\n",
    "    chi_square_statistic, p_value, degrees_of_freedom, expected_frequencies = stats.chi2_contingency(df_crosstab)\n",
    "    print('Chi-square statistic:', chi_square_statistic)\n",
    "    print('Chi-square p-value:', p_value)\n",
    "    print('Degrees of freedom:', degrees_of_freedom)\n",
    "    print('Expected frequencies:')\n",
    "    print(expected_frequencies)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
